/*******************************************************************************

Ryan Hill: ryan.hill@kellogg.northwestern.edu
Carolyn Stein: carolyn_stein@berkeley.edu
Last modified: December 2024

Inputs: 	pdb_full_structures.dta
			pdb_full_papers.dta
			pdb_analysis.dta
			phat.dta
			pdb_full_entities.dta
		
Outputs: 	Tables.xlsx (sheet table6)

Purpose: Compute spending on clean-up efforts
		 Compute time lags to improved structures

*******************************************************************************/

* Build the structure-level dataset: start from the full sample, then flag
* which structures also appear in the analysis sample
clear all
use "${data_built}pdb_full_structures.dta"

* assert(master match) halts the run if the using file contains a structure
* that is absent from the full sample (i.e. any using-only observation)
merge 1:1 structureId using "${data_built}pdb_analysis.dta", 			///
	assert(master match)
generate analysisSample = (_merge == 3)
drop _merge

* Attach paper-level variables; many structures may share one pubmedId
merge m:1 pubmedId using "${data_built}pdb_full_papers.dta", 			///
	assert(master match) nogenerate

* Attach the variables stored in phat.dta (one row per structure)
merge 1:1 structureId using "${data_built}phat.dta", 					///
	assert(master match) nogenerate

* Standardize each quality measure over the full sample. The centering and
* scaling constants are taken from the analysis sample only, so that
* magnitudes are comparable with analysis-sample results.

	* For every raw measure, create <measure>StdFull. The leading minus sign
	* flips the direction of each measure (presumably because lower raw values
	* indicate higher quality -- confirm against the variable definitions).
	local qualityVars refinementResolution rFree ramaRaw
	foreach v of local qualityVars {
		summarize `v' if analysisSample == 1
		local mu = r(mean)
		local sigma = r(sd)
		generate `v'StdFull = -(`v' - `mu')/`sigma'
	}
	
	* Quality index: plain sum of the three standardized measures. The sum is
	* not re-standardized because it is only used to rank structures against
	* one another. It is missing whenever any component is missing.
	generate indexFull = refinementResolutionStdFull + rFreeStdFull + ramaRawStdFull
	
* Define if a structure is an improvement to the original structure
*
* Creates three structure-level variables:
*   entityImprovement  share of the structure's non-priority entities whose
*                      quality index exceeds that of the priority entity in
*                      the same cluster (missing if the structure has no
*                      non-priority entity with a non-missing cluster)
*   fullImprovement    1 if every such entity is an improvement, else 0
*   anyImprovement     1 if at least one such entity is an improvement, else 0

	preserve
	
	* Merge on entities, since similarity defined at entity level
	* (data become one row per structure-entity)
	merge 1:m structureId using "${data_built}pdb_full_entities.dta", 	///
	keepusing(clusterNum100 priorityEntity)
	assert _merge != 1
	keep if _merge == 3
	drop _merge
	* missing() also catches extended missing codes (.a-.z), which would
	* otherwise be grouped together as if they were one cluster
	drop if missing(clusterNum100)

	* Create a variable for the quality of the priority entity
	* Note: sometimes multiple priority entities due to ties (same authors)
	* In that case, take the max quality of all the priority entities
	* (tempvar avoids clashing with any dataset variable named "temp")
	tempvar priorityQuality
	gen `priorityQuality' = indexFull if priorityEntity == 1
	bys clusterNum100: egen indexPriority = max(`priorityQuality') // max() ignores missing
	drop `priorityQuality'
	
	* Create a dummy for whether entity is an improvement over priority entity
	* Missing values sort above every number in Stata, so both sides must be
	* checked explicitly before the comparison can be trusted
	gen entityImprovement = (indexFull > indexPriority & 					///
	!missing(indexFull) & !missing(indexPriority))
	assert entityImprovement == 0 if priorityEntity == 1
	assert entityImprovement == 0 if missing(indexPriority)
	assert entityImprovement == 0 if missing(indexFull)
	keep if priorityEntity == 0
	
	* Collapse back to structure level: the mean of the entity dummies is the
	* share of the structure's non-priority entities that are improvements.
	* Only structureId and entityImprovement survive the collapse.
	collapse (mean) entityImprovement, by(structureId)
	tempfile improvement
	save `improvement'
	
	restore
	
	* Merge back on to structure data. Structures absent from the improvement
	* file keep entityImprovement missing and are coded 0 in both flags below.
	merge 1:1 structureId using `improvement'
	assert _merge != 2
	drop _merge
	gen fullImprovement = (entityImprovement == 1)
	gen anyImprovement = (entityImprovement > 0 & !missing(entityImprovement))

/*------------------------------------------------------------------------------

	Table 6: Costs of Structure Improvement

------------------------------------------------------------------------------*/

* Results container: a 4 x 1 column, one row per definition counted below
matrix structure_counts = J(4,1,.)
local row = 0
local col = 1

* Restrict the data in memory to x-ray structures
keep if experimentalTechnique == "X-RAY DIFFRACTION" 

* Costs of duplication under different definitions. Each definition narrows
* the first one; definitions 3 and 4 both build on definition 2.

	local def1 "priorityStructure == 0"
	local def2 "`def1' & priorityRace == 0"
	local def3 "`def2' & anyImprovement == 1"
	local def4 "`def2' & fullImprovement == 1"
	
	* Count the structures satisfying each definition, filling rows 1-4
	forvalues d = 1/4 {
		local ++row
		count if `def`d''
		matrix structure_counts[`row',`col'] = r(N)
	}

* Write the column of counts to the workbook. The matrix is loaded into an
* otherwise empty dataset because export excel writes the data in memory;
* preserve/restore brings the structure data back afterwards.
preserve
	clear
	svmat structure_counts
	export excel "${tables}Tables.xlsx", sheet(table6) cell(D60) sheetmodify
restore	

